Primary Analysis - Bringing Everything Together
After finishing priliminary analysis, we were finished with understanding and cleaning the datasets. Next we decided to see how the different economic traits [Income, Price of Goods, Retail Sales, and Household Spending] relate with each other].
Limitations Faced
- First limitation we identified was the missing links between the datasets. The Pricing dataset, even though had hundreds of products but it could be only liked with other datasets for two categories namely Food and Fuel. Hence Sales and Spending datasets, even though they linked with each other just fine, could only link with pricing using Food and Fuel category.
- Limited knowledge of statistical techniques to implement.
Primary Analysis 1.1 - Retail Sales vs Household Spending
Comparing trends we see that even though sales for clothing and accessories, and fuel continued to increase, there was a sharp fall in amount spent by households and it remained lower for most of 2014 to 2016 period.
temp_hst_exp<- df_hs_hst %>%
select(1,2,3,5) %>%
rename(expenditure_type=household_expenditures_summary_level_categories,spending=value,year=ref_date) %>%
filter(household_type!="All classes") %>%
filter(expenditure_type %in% c("Food expenditures","Shelter","Communications","Clothing and accessories","Transportation","Health care","Personal care","Recreation","Tobacco products and alcoholic beverages","Miscellaneous expenditures","Gas and other fuels (all vehicles and tools)")) %>% group_by(expenditure_type,year) %>% summarise(mean(spending)) %>% clean_names()
temp_hst_exp<- temp_hst_exp %>% rename(mean_spending_dollars=mean_spending)
temp_df_rsv<-df_rsv_pri %>% rename(mean_sales_millions=mean_value_in_dollar_million)
temp_df_rsv$sales_type<-as.character(df_rsv_pri$sales_type)
for (i in 1:nrow(temp_df_rsv))
{
if(str_detect(temp_df_rsv$sales_type[i],"Furniture")==TRUE)
{
temp_df_rsv$sales_type[i]<-"Shelter"
}
else if (str_detect(temp_df_rsv$sales_type[i],"Clothing")==TRUE)
{
temp_df_rsv$sales_type[i]="Clothing and accessories"
}
else if (str_detect(temp_df_rsv$sales_type[i],"Food")==TRUE)
{
temp_df_rsv$sales_type[i]="Food expenditures"
}
else if (str_detect(temp_df_rsv$sales_type[i],"Gasoline")==TRUE)
{
temp_df_rsv$sales_type[i]="Gas and other fuels (all vehicles and tools)"
}
else if (str_detect(temp_df_rsv$sales_type[i],"Health")==TRUE)
{
temp_df_rsv$sales_type[i]="Health and personal care"
}
else if (str_detect(temp_df_rsv$sales_type[i],"Miscellaneous")==TRUE)
{
temp_df_rsv$sales_type[i]="Miscellaneous expenditures"
}
else
{
temp_df_rsv$sales_type[i]=temp_df_rsv$sales_type[i]
}
}
temp_df_rsv$sales_type<-as.factor(temp_df_rsv$sales_type)
temp_rsv_food<- temp_df_rsv %>% filter(sales_type=="Food expenditures") %>% rename(mean_sf_m=mean_sales_millions)
temp_rsv_fuel<- temp_df_rsv %>% filter(sales_type=="Gas and other fuels (all vehicles and tools)") %>% rename(mean_sfu_m=mean_sales_millions)
# Merged Datasets
temp_rsv_hst<-merge(temp_df_rsv,temp_hst_exp,by.x=c("sales_type","year"),by.y=c("expenditure_type","year"))Primary Analysis 1.1.1 - Retail Sales vs Household Spending
p10<- temp_rsv_hst %>%
ggplot(
aes(
x = year,
y = mean_sales_millions,
colour = sales_type
)
) +
geom_point() +
labs(x = 'Year',
y = 'Value in Million Dollars',
colour = 'Retail Sales of Different Products',
title = 'Primary Graph 1.1.1a - Retail Sales') +
scale_y_continuous() +
geom_smooth(method = 'loess') +
facet_wrap(~sales_type,ncol=2,scale="free")
p11<- temp_rsv_hst %>%
ggplot(
aes(
x = year,
y = mean_spending_dollars,
colour = sales_type
)
) +
geom_point() +
labs(x = 'Year',
y = 'Value in Unit Dollars',
colour = 'Household Spending of Different Products',
title = 'Primary Graph 1.1.1b - Household Spending of Products') +
scale_y_continuous() +
geom_smooth(method = 'loess') +
facet_wrap(~sales_type,ncol=2,scale="free")
ggarrange(p10,p11,nrow=2,common.legend = TRUE,legend = "right" )Primary Analysis 1.2 - Fuel and Food - Price vs Sales vs Household Spending
From Pricing dataset, we had fuel and food as the common link between the three datasets. Hence we were able to go ahead with performing a linear regression to check how changes in Price affected Sales and Household Spending.
temp_sar_food<- df_hs_sar %>% filter(size_of_area_of_residence=="All classes" & household_expenditures_summary_level_categories=="Food expenditures") %>% select(3,1,5) %>% rename(spending_type=household_expenditures_summary_level_categories,year=ref_date,mean_fsp_du=value)
temp_sar_fuel<- df_hs_sar %>% filter(size_of_area_of_residence=="All classes" & household_expenditures_summary_level_categories=="Gas and other fuels (all vehicles and tools)") %>% select(3,1,5) %>% rename(spending_type=household_expenditures_summary_level_categories,year=ref_date,mean_fusp_du=value)
temp_pri_ph<-df_pri_ma %>% mutate(
product_high=case_when(
str_detect(products,"juice|Apples|Oranges|Bread|milk|cheese|oil|Potatoes|salmon|beef|Wieners|Tea|Coffee|drinks")==TRUE~'Food and beverage',
str_detect(products,"gasoline")==TRUE~'Fuel',
TRUE~'Not Needed'
)) %>% select(1,3,4) %>% filter(product_high!="Not Needed")
temp_pri_food<- temp_pri_ph %>% filter(product_high=="Food and beverage")
temp_pri_food$product_high<-as.factor(temp_pri_food$product_high)
temp_pri_food$year<- as.factor(substring(temp_pri_food$year,1,4))
temp_pri_food$product_high<-as.factor(temp_pri_food$product_high)
temp_pri_food<-temp_pri_food %>% group_by(product_high,year) %>% summarise(sum(value_in_dollar_unit)) %>% clean_names() %>% rename(sum_fp_d_u=sum_value_in_dollar_unit)
temp_pri_food$sum_fp_d_u<-round(temp_pri_food$sum_fp_d_u,2)
temp_pri_food$year<-as.numeric(as.character(temp_pri_food$year))
temp_pri_fuel<- temp_pri_ph %>% filter(product_high=="Fuel")
temp_pri_fuel$product_high<-as.factor(temp_pri_fuel$product_high)
temp_pri_fuel$year<- as.factor(substring(temp_pri_fuel$year,1,4))
temp_pri_fuel$product_high<-as.factor(temp_pri_fuel$product_high)
temp_pri_fuel<-temp_pri_fuel %>% group_by(product_high,year) %>% summarise(mean(value_in_dollar_unit)) %>% clean_names() %>% rename(mean_fup_d_u=mean_value_in_dollar_unit)
temp_pri_fuel$mean_fup_d_u<-round(temp_pri_fuel$mean_fup_d_u,2)
temp_pri_fuel$year<-as.numeric(as.character(temp_pri_fuel$year))
## Dataset for fuel prices, sales, and expenditures
temp<- merge(temp_pri_fuel[,c(2:3)],temp_rsv_fuel[,c(2:3)],by="year")
temp_fuel<- merge(temp[,c(1:3)],temp_sar_fuel[,c(2:3)],by="year")
temp<- merge(temp_pri_fuel[,c(2:3)],temp_rsv_fuel[,c(2:3)],by="year")
temp_fuel<- merge(temp[,c(1:3)],temp_sar_fuel[,c(2:3)],by="year")
temp_fuel$year<-as.factor(temp_fuel$year)
temp<- merge(temp_pri_food[,c(2:3)],temp_rsv_food[,c(2:3)],by="year")
temp_food<- merge(temp[,c(1:3)],temp_sar_food[,c(2:3)],by="year")
temp_food$year<-as.factor(temp_food$year)Primary Analysis 1.2.1 - Fuel - Price vs Sales vs Spending
# Price vs Spending - fuel
b1<-coef(lm(mean_sfu_m~mean_fup_d_u, data = temp_fuel))
b2<-coef(lm(mean_fusp_du~mean_fup_d_u, data = temp_fuel))
p12<-temp_fuel %>%
ggplot(
aes(
x = mean_fup_d_u,
y = mean_sfu_m
)
) + coord_flip()+
geom_point(size=2) +
labs(x = 'Fuel Prices in Cents/Lts',
y = 'Retail Sales in Million Dollar',
title = 'Fuel Prices vs Retail Sales [2012 - 2017]') +
scale_x_continuous(labels = comma) +
scale_y_continuous(labels = comma) +
geom_smooth(method = 'lm',se=FALSE) +
geom_abline(
slope = b1[2],
intercept = b1[1],
colour = 'red',
size = 3,
alpha = 0.1
)
p13<- temp_fuel %>%
ggplot(
aes(
x = mean_fup_d_u,
y = mean_fusp_du
)
) + coord_flip()+
geom_point(size=2) +
labs(x = 'Fuel Prices in Cents/Lts',
y = 'Household Spending in Dollar/Unit',
title = 'Fuel Prices vs Household Spending [2012 - 2017]') +
scale_x_continuous(labels = comma) +
scale_y_continuous(labels = comma) +
geom_smooth(method = 'lm',se=FALSE) +
geom_abline(
slope = b2[2],
intercept = b2[1],
colour = 'red',
size = 3,
alpha = 0.1
)
ggarrange(p12,p13)Primary Analysis 1.2.2 - Food - Price vs Sales vs Spending
# Price vs Spending - Food
b3<-coef(lm(mean_sf_m~sum_fp_d_u, data = temp_food))
b4<-coef(lm(mean_fsp_du~sum_fp_d_u, data = temp_food))
p14<- temp_food %>%
ggplot(
aes(
x = sum_fp_d_u,
y = mean_sf_m
)
) + coord_flip()+
geom_point(size=2) +
labs(x = 'Food Prices in Dollar/Unit',
y = 'Retail Sales in Million Dollar',
title = 'Food Prices vs Retail Sales [2012 - 2017]') +
scale_x_continuous(labels = comma) +
scale_y_continuous(labels = comma) +
geom_smooth(method = 'lm',se=FALSE) +
geom_abline(
slope = b3[2],
intercept = b3[1],
colour = 'red',
size = 3,
alpha = 0.1
)
p15<- temp_food %>%
ggplot(
aes(
x = sum_fp_d_u,
y = mean_fsp_du
)
) + coord_flip()+
geom_point(size=2) +
labs(x = 'Food Prices in Dollar/Unit',
y = 'Household Spending in Dollar/Unit',
title = 'Food Prices vs Household Spending [2012 - 2017]') +
scale_x_continuous(labels = comma) +
scale_y_continuous(labels = comma) +
geom_smooth(method = 'lm',se=FALSE) +
geom_abline(
slope = b4[2],
intercept = b4[1],
colour = 'red',
size = 3,
alpha = 0.1
)
ggarrange(p14,p15)Primary Analysis 1.3 - Correlogram
Primary Analysis 1.3.1 - Correlogram - Price vs Sales vs Spending
df_corr<-merge(temp_food,temp_fuel,by="year") %>% rename(food_pri_du=sum_fp_d_u,food_sales_dm=mean_sf_m,food_spen_du=mean_fsp_du,fuel_pri_dl=mean_fup_d_u,sales_fuel_dm=mean_sfu_m,spen_fuel_dl=mean_fusp_du)
df_corr$year<-as.numeric(as.character(df_corr$year))
corr <- round(cor(df_corr), 1)
# Plotting Graph to check correlation between different variables
p16<- ggcorrplot(corr, hc.order = TRUE,
type = "lower",
lab = TRUE,
lab_size = 3,
method="circle",
colors = c("tomato2", "white", "springgreen3"),
title="Correlogram between Price, Sales and Spending",
ggtheme=theme_bw)
p16